1 Executive Summary

Study Phase 1b Clinical Trial (NCT03299946)
Intervention cabozantinib + nivolumab
Sponsor/Collab John Hopkins + Exelixis
Disease hepatocellular carcinoma (HCC)
Tissue Liver
Spatial Data Accession GSE238264
Sample Numbers 15 patients. (5 achieved pathologic response)
Platform 10x Genomics Visium spatial transcriptomics

What is Cabozantinib

Click for Details
  • An oral targeted therapy
  • Works by acting as a multi-tyrosine kinase inhibitor targeting VEGFR2, c-Met, RET, TIE2, and AXL involved in angiogenesis and tumor growth.
  • By inhibiting these pathways, it starves tumors of blood supply and blocks metastatic signaling.


What is Nivolumab

Click for Details
  • A monoclonal Antibody.
  • Blocks the PD-1 immune checkpoint receptor on T cells.
  • By Inhibiting the PD-1/PD-L1 interaction, it prevents cancer cells from turning off the immune response, essentially removing the “brakes” on T cell activation
  • Allows the patient’s own immune system to recognize and attack tumor cells.


Code
# Core analysis packages
library(Seurat)
library(SeuratObject)
library(ggplot2)
library(dplyr)
library(tidyr)
library(data.table)

# Spatial transcriptomics specific
library(spatstat)
library(Matrix)
library(rjson)
library(cowplot)
library(RColorBrewer)
library(viridis)
library(DT)
library(hdf5r)

# Single cell integration
library(harmony)
library(clustree)
library(scran)
library(scater)

# Drug discovery and pathway analysis
library(clusterProfiler)
library(org.Hs.eg.db)
library(ReactomePA)
library(GSVA)
library(msigdbr)
library(enrichplot)

# Cell-cell communication
library(CellChat)
library(nichenetr)

# Statistical analysis
library(ComplexHeatmap)
library(circlize)
library(corrplot)
library(survival)
library(survminer)

# Biomarker discovery
library(randomForest)
library(caret)
library(pROC)
library(ROCR)

# Set theme for consistent plotting
theme_set(
  theme_bw() + theme(
    plot.title = element_text(
      size = 14
      , face = "bold"
    )
    , axis.title = element_text(size = 12)
    , axis.text = element_text(size = 10)
    , legend.text = element_text(size = 10)
  )
)

# Set random seed for reproducibility
set.seed(42)
Code
# Create directory structure
workDir <- "G:/SpatialOmics/Tx_Drug_Discovery"

dirs <- c(
  "data/raw/spatial"
  , "data/raw/single_cell"
  , "data/processed"
  , "results/figures"
  , "results/tables"
  , "results/biomarkers"
  , "results/drug_targets"
)

sapply(dirs, function(dir) {
  full_path <- file.path(workDir, dir)
  if (!dir.exists(full_path)) {
    dir.create(
      path = full_path
      , recursive = TRUE
    )
  }
})

2 Spatial Transcriptomics Data (GSE238264)

2.1 ST - Get Metadata

Code
# Function to download and load spatial transcriptomics data
load_spatial_metadata <- function(
    geo_accession = "some accession"
    , save_dir = "path/to/save/dir"
) {
  
  message(
    "Loading spatial transcriptomics data from "
    , geo_accession
  )
  # Load required packages
  if (!require("GEOquery", quietly = TRUE)) {
    BiocManager::install("GEOquery")
    library(GEOquery)
  }
  
  message("Downloading spatial transcriptomics data from ", geo_accession)
  
  # Download GEO data
  gse <- getGEO(geo_accession, GSEMatrix = TRUE, AnnotGPL = FALSE)
  
  # Extract expression data and sample metadata
  if (length(gse) > 1) {
    # Multiple platforms - take the first one or specify which one you want
    eset <- gse[[1]]
  } else {
    eset <- gse[[1]]
  }
  
  # Extract expression matrix
  expression_data <- exprs(eset)
  
  # Extract sample metadata
  sample_metadata <- pData(eset)
  
  # Save the raw data
  write.csv(expression_data, 
            file = file.path(save_dir, "expression_matrix.csv"))
  write.csv(sample_metadata, 
            file = file.path(save_dir, "sample_metadata.csv"), 
            row.names = FALSE)
  
  message("Downloaded data for ", ncol(expression_data), " samples")
  message("Expression data: ", nrow(expression_data), " features")
  
  return(list(
    expression_data = expression_data,
    sample_metadata = sample_metadata
  ))
  
  return(sample_metadata)
}

# Load the sample metadata
raw_spatial_data <- load_spatial_metadata(  
  geo_accession = "GSE238264"
  , save_dir = file.path(workDir, "data/raw/spatial" )
)

# TODO: Manual Clean up
sample_metadata<- raw_spatial_data$sample_metadata  %>%
  # Delete columns with unchanging values
  dplyr::select(where(~ length(unique(.x)) > 1)) %>% 
  dplyr::select(geo=geo_accession, response=`phenotype:ch1`, sample_id=supplementary_file_1) %>% 
  mutate( response = ifelse(response %in% "responder", "Responder", "Non_Responder")
          , sample_id = gsub(".*_|.tar.gz", "", sample_id)
          # Here I make up some metadata for analysis purposes
          , pathologic_response = ifelse(response %in% "Responder", "Major", "None")
          , cabozantinib_dose = rep("60mg_daily", 7)
          , nivolumab_dose = rep("240mg_Q2W", 7)
          , treatment_duration = rep("8_weeks", 7)
          , tumor_size_baseline = c(8.2, 6.5, 12.1, 9.8, 7.3, 11.2, 13.5)
          , tumor_size_post = c(1.2, 1.8, 2.1, 2.3, 9.8, 12.1, 8.2)
          , age = c(62, 58, 71, 66, 59, 73, 68)
          , etiology = c("HBV", "HCV", "NASH", "HBV", "Alcohol", "HCV", "HBV")
          
  )

head(sample_metadata %>% dplyr::select(sample_id, response, tumor_size_baseline, tumor_size_post, age, etiology))
##            sample_id      response tumor_size_baseline tumor_size_post age etiology
## GSM7661255     HCC1R     Responder                 8.2             1.2  62      HBV
## GSM7661256     HCC2R     Responder                 6.5             1.8  58      HCV
## GSM7661257     HCC3R     Responder                12.1             2.1  71     NASH
## GSM7661258     HCC4R     Responder                 9.8             2.3  66      HBV
## GSM7661259    HCC5NR Non_Responder                 7.3             9.8  59  Alcohol
## GSM7661260    HCC6NR Non_Responder                11.2            12.1  73      HCV
Code
fetch_eset <- function(geo_accession="someAccession"
                       , save_dir = "path/to/save/dir") {
  message("Downloading HCC scRNA-seq reference data from ", geo_accession)
  
  # Download GEO metadata
  gse <- (getGEO(geo_accession, GSEMatrix = TRUE, AnnotGPL = FALSE))
  
  if (length(gse) > 1) {
    eset <- gse[[1]]  # Take first platform
  } else {
    eset <- gse[[1]]
  }
  
  #Get expression data
  options(timeout = 400)  # Increase timeout
  
  getGEOSuppFiles(geo_accession, makeDirectory = TRUE, baseDir = save_dir ) 
}

2.2 ST - Get Spatial Data

Code
#~~~~~~~~~~~~~~~~~~~
# Load Esets
#~~~~~~~~~~~~~~~~~~
rawfiles <- file.path(workDir,"data/raw/spatial/GSE238264") 

sample_paths <- list.files(rawfiles, full.names = T)
sample_names <- basename(sample_paths)

# Function to load multiple 10x Visium samples
load_multiple_spatial_samples <- function(sample_paths, sample_names) {
  
  spatial_list <- list()
  
  for(i in 1:length(sample_paths)) {
    cat("Loading", sample_names[i], "...\n")
    
    # Load 10x Visium data
    spatial_data <- Load10X_Spatial(
      data.dir = sample_paths[i],
      filename = "filtered_feature_bc_matrix.h5",  # or matrix.mtx.gz
      assay = "Spatial",
      slice = sample_names[i]
    )
    
    # Add sample metadata
    spatial_data$sample_id <- sample_names[i]
    #spatial_data$patient_id <- sample_names[i]  # Adjust based on your naming
    
    spatial_list[[sample_names[i]]] <- spatial_data
  }
  
  return(spatial_list)
}

# Load all samples
spatial_samples <- load_multiple_spatial_samples(
  sample_paths = sample_paths,
  sample_names = sample_names
)
## Loading HCC1R ...
## Loading HCC2R ...
## Loading HCC3R ...
## Loading HCC4R ...
## Loading HCC5NR ...
## Loading HCC6NR ...
## Loading HCC7NR ...

2.3 ST - Quality Control

Code
# Function to calculate QC metrics
calculate_spatial_qc <- function(spatial_obj) {
  
  # Basic metrics
  spatial_obj$nCount_Spatial <- colSums(spatial_obj@assays$Spatial$counts)
  spatial_obj$nFeature_Spatial <- colSums(spatial_obj@assays$Spatial$counts > 0)
  
  # Mitochondrial genes
  spatial_obj$percent.mt <- PercentageFeatureSet(spatial_obj, pattern = "^MT-|^mt")
  
  # Ribosomal genes
  spatial_obj$percent.ribo <- PercentageFeatureSet(spatial_obj, pattern = "^RP[SL]|rp[sl]")
  
  # Hemoglobin genes (contamination)
  spatial_obj$percent.hb <- PercentageFeatureSet(spatial_obj, pattern = "^HB[^(P)]|hb[^(p)]")
  
  # Log10 transformation for better visualization
  spatial_obj$log10_nCount <- log10(spatial_obj$nCount_Spatial + 1)
  spatial_obj$log10_nFeature <- log10(spatial_obj$nFeature_Spatial + 1)
  
  return(spatial_obj)
}

# Apply QC to all samples
spatial_samples <- lapply(spatial_samples, calculate_spatial_qc)

2.3.1 Visualise QC Metrics

Code
# Calculate QC Metrics
theme_spatial <- theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    axis.text.y = element_text(size = 8),
    axis.title = element_text(size = 10),
    plot.title = element_text(size = 11),
    legend.text = element_text(size = 8),
    strip.text = element_text(size = 9)
  )


# Function for comprehensive QC plots
plot_spatial_qc <- function(spatial_list, sample_metadata) {
  
  library(ggplot2)
  library(patchwork)
  
  # Extract QC metrics from all samples
  qc_data <- data.frame()
  
  for(sample_name in names(spatial_list)) {
    obj <- spatial_list[[sample_name]]
    sample_qc <- data.frame(
      sample_id = sample_name,
      nCount_Spatial = obj$nCount_Spatial,
      nFeature_Spatial = obj$nFeature_Spatial,
      percent.mt = obj$percent.mt,
      percent.ribo = obj$percent.ribo,
      log10_nCount = obj$log10_nCount,
      log10_nFeature = obj$log10_nFeature
    )
    qc_data <- rbind(qc_data, sample_qc)
  }
  
  # Merge with response metadata
  qc_data <- merge(qc_data, sample_metadata, by = "sample_id")
  
  # Calculate correlations for annotation
  correlations <- qc_data %>%
    group_by(sample_id, response) %>%
    summarise(
      correlation = cor(nCount_Spatial, nFeature_Spatial),
      .groups = 'drop'
    )
  
  # 1. Violin plots for key metrics
  p1 <- ggplot(qc_data, aes(x = sample_id, y = nCount_Spatial, fill = response)) +
    geom_violin() + geom_boxplot(width = 0.1) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(title = "UMI Count per Spot", y = "nCount_Spatial") + theme_spatial
  
  p2 <- ggplot(qc_data, aes(x = sample_id, y = nFeature_Spatial, fill = response)) +
    geom_violin() + geom_boxplot(width = 0.1) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(title = "Genes per Spot", y = "nFeature_Spatial") + theme_spatial
  
  p3 <- ggplot(qc_data, aes(x = sample_id, y = percent.mt, fill = response)) +
    geom_violin() + geom_boxplot(width = 0.1) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(title = "Mitochondrial %", y = "percent.mt") + theme_spatial
  
  # 2. Scatter plot with correlation coefficients
  p4 <- ggplot(qc_data, aes(x = nCount_Spatial, y = nFeature_Spatial, color = response)) +
    geom_point(alpha = 0.6) +
    geom_smooth(method = "lm", se = TRUE, alpha = 0.3) +  # Add trend lines
    facet_wrap(~sample_id, scales = "free") +
    geom_text(
      data = correlations,
      aes(label = paste("r =", round(correlation, 3))),
      x = Inf, y = Inf, hjust = 1.1, vjust = 1.1,
      color = "black", size = 3, inherit.aes = FALSE
    ) +
    labs(title = "UMI vs Genes Correlation per Sample") +
    theme_spatial
  
  p5 <- ggplot(qc_data, aes(x = nCount_Spatial, y = percent.mt, color = response)) +
    geom_point(alpha = 0.6) +
    facet_wrap(~sample_id, scales = "free") +
    labs(title = "UMI vs Mitochondrial %") + theme_spatial
  
  # 3. Summary statistics
  summary_stats <- qc_data %>%
    group_by(sample_id, response) %>%
    summarise(
      median_UMI = median(nCount_Spatial),
      mean_UMI = mean(nCount_Spatial),
      median_genes = median(nFeature_Spatial),
      median_mt = median(percent.mt),
      n_spots = n(),
      .groups = 'drop'
    )
  
  p6 <- ggplot(summary_stats, aes(x = sample_id, y = mean_UMI, fill = response)) +
    geom_col() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(title = "Mean UMI per Sample", y = "Mean UMI Count") + theme_spatial
  
  p7 <- ggplot(summary_stats, aes(x = sample_id, y = n_spots, fill = response)) +
    geom_col() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(title = "Number of Spots per Sample", y = "Number of Spots") + theme_spatial
  
  # Combine plots
  combined_plot1 <- p1
  combined_plot2 <- p2
  combined_plot3 <- p3
  combined_plot4 <- (p6 | p7) 
  combined_plot5 <- (p4) 
  combined_plot6 <- (p5)
  
  return(list(
    combined_plot1 = combined_plot1,
    combined_plot2 = combined_plot2,
    combined_plot3 = combined_plot3,
    combined_plot4 = combined_plot4,
    combined_plot5 = combined_plot6,
    qc_data = qc_data,
    summary_stats = summary_stats
  ))
}

# Generate QC plots
qc_results <- plot_spatial_qc(spatial_samples, sample_metadata)
print(qc_results$combined_plot1)

Code
print(qc_results$combined_plot2)

Code
print(qc_results$combined_plot3)

Code
print(qc_results$combined_plot4)

Code
print(qc_results$combined_plot5)

Code
print(qc_results$combined_plot6)
## NULL
Code
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Spatial Distribution of QC Metrics 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

# Plot QC metrics overlaid on tissue
plot_spatial_qc_overlay <- function(spatial_obj, sample_name) {
  
  p1 <- SpatialFeaturePlot(spatial_obj, features = "nCount_Spatial") +
    ggtitle(paste(sample_name, "- UMI Count"))  + theme_spatial
  
  p2 <- SpatialFeaturePlot(spatial_obj, features = "nFeature_Spatial") +
    ggtitle(paste(sample_name, "- Gene Count"))  + theme_spatial
  
  p3 <- SpatialFeaturePlot(spatial_obj, features = "percent.mt") +
    ggtitle(paste(sample_name, "- Mitochondrial %"))  + theme_spatial
  
  combined <- (p1 | p2) / p3
  return(combined)
}

# Plot for each sample
for(sample_name in names(spatial_samples)) {
  cat("Plotting spatial QC for", sample_name, "\n")
  p <- plot_spatial_qc_overlay(spatial_samples[[sample_name]], sample_name)
  print(p)
}
## Plotting spatial QC for HCC1R

## Plotting spatial QC for HCC2R

## Plotting spatial QC for HCC3R

## Plotting spatial QC for HCC4R

## Plotting spatial QC for HCC5NR

## Plotting spatial QC for HCC6NR

## Plotting spatial QC for HCC7NR

VERDICT: Remove HCC2R, as it seems have been very poorly sequences. Both Library Size and Genes detected is low in this sample. Most expression occur in edges which is a red flag.

2.3.2 Low Feature Spots

Code
sapply(
  names(spatial_samples)
  , function(dataset_name) {
    
    seurat_obj <- spatial_samples[[dataset_name]]
    
    # Sum up number of spots where a gene has non-zero count
    gene_detection_rates <- Matrix::rowSums(
      GetAssayData(
        seurat_obj
        , slot = "counts"
      ) > 0
    )
    
    data.frame(
      `**Genes detected in >50% of spots:**` = sum(gene_detection_rates > ncol(seurat_obj)/2)
      , `**Genes detected in >10% of spots:**` = sum(gene_detection_rates > ncol(seurat_obj)/10)
      , `**Genes detected in <5% of spots:**` = sum(gene_detection_rates < ncol(seurat_obj)/20)
      , check.names = FALSE
    ) %>% 
      t() %>% 
      knitr::kable(caption=dataset_name) %>%
      print()
    
    cat("\n")
  }
  , simplify = FALSE
)
## 
## 
## Table: (\#tab:unnamed-chunk-3)HCC1R
## 
## |                                     |      |
## |:------------------------------------|-----:|
## |**Genes detected in >50% of spots:** |   461|
## |**Genes detected in >10% of spots:** |  4165|
## |**Genes detected in <5% of spots:**  | 29494|
## 
## 
## 
## Table: (\#tab:unnamed-chunk-3)HCC2R
## 
## |                                     |      |
## |:------------------------------------|-----:|
## |**Genes detected in >50% of spots:** |    69|
## |**Genes detected in >10% of spots:** |   889|
## |**Genes detected in <5% of spots:**  | 34221|
## 
## 
## 
## Table: (\#tab:unnamed-chunk-3)HCC3R
## 
## |                                     |      |
## |:------------------------------------|-----:|
## |**Genes detected in >50% of spots:** |   419|
## |**Genes detected in >10% of spots:** |  4742|
## |**Genes detected in <5% of spots:**  | 28874|
## 
## 
## 
## Table: (\#tab:unnamed-chunk-3)HCC4R
## 
## |                                     |      |
## |:------------------------------------|-----:|
## |**Genes detected in >50% of spots:** |  2742|
## |**Genes detected in >10% of spots:** | 10701|
## |**Genes detected in <5% of spots:**  | 24131|
## 
## 
## 
## Table: (\#tab:unnamed-chunk-3)HCC5NR
## 
## |                                     |      |
## |:------------------------------------|-----:|
## |**Genes detected in >50% of spots:** |   827|
## |**Genes detected in >10% of spots:** |  4660|
## |**Genes detected in <5% of spots:**  | 29447|
## 
## 
## 
## Table: (\#tab:unnamed-chunk-3)HCC6NR
## 
## |                                     |      |
## |:------------------------------------|-----:|
## |**Genes detected in >50% of spots:** |  3289|
## |**Genes detected in >10% of spots:** | 10215|
## |**Genes detected in <5% of spots:**  | 24651|
## 
## 
## 
## Table: (\#tab:unnamed-chunk-3)HCC7NR
## 
## |                                     |      |
## |:------------------------------------|-----:|
## |**Genes detected in >50% of spots:** |  1040|
## |**Genes detected in >10% of spots:** |  6003|
## |**Genes detected in <5% of spots:**  | 27880|
## $HCC1R
## NULL
## 
## $HCC2R
## NULL
## 
## $HCC3R
## NULL
## 
## $HCC4R
## NULL
## 
## $HCC5NR
## NULL
## 
## $HCC6NR
## NULL
## 
## $HCC7NR
## NULL

2.3.3 Outlier Detection

Code
outliers_df <- data.frame() 
sapply(
  names(spatial_samples)
  , function(dataset_name) {
    
    seurat_obj <- spatial_samples[[dataset_name]]
    
    # UMI outliers using 1.5 * IQR rule
    Q1_umi <- quantile(seurat_obj@meta.data$nCount_Spatial, 0.25)
    Q3_umi <- quantile(seurat_obj@meta.data$nCount_Spatial, 0.75)
    IQR_umi <- Q3_umi - Q1_umi
    umi_outliers <- sum(seurat_obj@meta.data$nCount_Spatial < (Q1_umi - 1.5*IQR_umi) |
                          seurat_obj@meta.data$nCount_Spatial > (Q3_umi + 1.5*IQR_umi))
    
    # Feature outliers
    Q1_feat <- quantile(seurat_obj@meta.data$nFeature_Spatial, 0.25)
    Q3_feat <- quantile(seurat_obj@meta.data$nFeature_Spatial, 0.75)
    IQR_feat <- Q3_feat - Q1_feat
    feat_outliers <- sum(seurat_obj@meta.data$nFeature_Spatial < (Q1_feat - 1.5*IQR_feat) |
                           seurat_obj@meta.data$nFeature_Spatial > (Q3_feat + 1.5*IQR_feat))
    
    tmp <- data.frame(
      # UMI outliers (1.5×IQR rule)
      umi_outliers =paste(umi_outliers, "spots (", round(100*umi_outliers/ncol(seurat_obj), 1), "%)")
      # Feature outliers (1.5×IQR rule)
      , feat_outliers= paste(feat_outliers, "spots (", round(100*feat_outliers/ncol(seurat_obj), 1), "%)")
    )
    
    rbind(tmp,outliers_df )
  }
)
##               HCC1R                HCC2R                 HCC3R               HCC4R            HCC5NR              HCC6NR               HCC7NR             
## umi_outliers  "240 spots ( 8 %)"   "379 spots ( 13.7 %)" "15 spots ( 0.7 %)" "0 spots ( 0 %)" "23 spots ( 1.7 %)" "128 spots ( 5 %)"   "60 spots ( 2.4 %)"
## feat_outliers "127 spots ( 4.2 %)" "225 spots ( 8.1 %)"  "0 spots ( 0 %)"    "0 spots ( 0 %)" "61 spots ( 4.6 %)" "124 spots ( 4.8 %)" "28 spots ( 1.1 %)"

VERDICT: Only 69 house keeping genes (>50% spot) in HCC2R, marked for removal.

2.3.4 Predict Gender

Code
# Define sex-specific marker genes
sex_markers <- list(
  # Y chromosome genes (male-specific)
  Y_genes = c("RPS4Y1", "EIF1AY", "KDM5D", "UTY", "USP9Y", "DDX3Y", "TMSB4Y"),
  
  # X chromosome markers
  X_genes = c("XIST"),  # X-inactivation gene (female-specific)
  
  # Additional sex-linked genes
  sex_genes = c("SRY", "ZFY", "AMELX", "AMELY")
)

# Function to predict gender from spatial data
predict_gender_spatial <- function(spatial_obj, sex_markers) {
  
  # Get expression data
  expr_data <- GetAssayData(spatial_obj, slot = "counts")
  
  # Calculate Y gene expression score
  Y_genes_present <- intersect(sex_markers$Y_genes, rownames(expr_data))
  
  if(length(Y_genes_present) > 0) {
    Y_score <- colMeans(expr_data[Y_genes_present, , drop = FALSE])
  } else {
    Y_score <- rep(0, ncol(expr_data))
  }
  
  # Calculate XIST expression (female marker)
  if("XIST" %in% rownames(expr_data)) {
    XIST_score <- expr_data["XIST", ]
  } else {
    XIST_score <- rep(0, ncol(expr_data))
  }
  
  # Create prediction scores
  spatial_obj$Y_gene_score <- Y_score
  spatial_obj$XIST_score <- XIST_score
  
  # Predict gender based on thresholds
  spatial_obj$predicted_gender <- ifelse(
    Y_score > 0.1, "Male",
    ifelse(XIST_score > 0.2, "Female", "Uncertain")
  )
  
  
  
  return(spatial_obj)
}

# Apply to all samples
spatial_samples_gender <- lapply(spatial_samples, function(obj) {
  predict_gender_spatial(obj, sex_markers)
})

# Extract gender predictions from all samples into one dataframe
compile_gender_predictions <- function(spatial_samples_gender) {
  
  results_df <- data.frame()
  
  for(sample_name in names(spatial_samples_gender)) {
    
    # Get the table of predictions for this sample
    gender_table <- table(spatial_samples_gender[[sample_name]]$predicted_gender)
    
    # Convert to dataframe with sample info
    sample_df <- data.frame(
      sample_id = sample_name,
      gender = names(gender_table),
      count = as.numeric(gender_table),
      proportion = round(as.numeric(gender_table) / sum(gender_table), 3)
    )
    
    results_df <- rbind(results_df, sample_df)
  }
  
  return(results_df)
}

genderpred <- compile_gender_predictions(spatial_samples_gender) %>%
  filter(!gender %in% "Uncertain") %>% 
  group_by(sample_id) %>% 
  slice_max(order_by = count, n = 1) %>%
  ungroup() %>% 
  dplyr::select(sample_id, gender)

# Assign gender to gender column
for(i in 1:nrow(genderpred)) {
  sample_name <- genderpred$sample_id[i]
  if(sample_name %in% names(spatial_samples)) {
    spatial_samples[[sample_name]]$gender <- genderpred$gender[i]
  }
}

genderpred %>% knitr::kable()
sample_id gender
HCC1R Male
HCC2R Male
HCC3R Female
HCC4R Male
HCC5NR Female
HCC6NR Female
HCC7NR Female

2.3.5 Filtering

min_nCount_Spatial 500
min_nFeature_Spatial 200
max_nFeature_Spatial 8000
max_percent.mt 20
Code
min_nCount_Spatial=500
min_nFeature_Spatial=200
max_nFeature_Spatial=6000
max_percent.mt=20
#~~~~~~~~~~~~~~~~~~~#
# Apply Filter
#~~~~~~~~~~~~~~~~~~~#
# Apply consistent filtering across all samples
filter_spatial_samples <- function(spatial_list, thresholds) {
  
  filtered_samples <- list()
  
  for(sample_name in names(spatial_list)) {
    obj <- spatial_list[[sample_name]]
    
    cat("Filtering", sample_name, "...\n")
    cat("  Before:", ncol(obj), "spots\n")
    
    # Apply filters
    obj <- subset(obj, 
                  subset = nCount_Spatial >= min_nCount_Spatial & 
                    nFeature_Spatial >= min_nFeature_Spatial &
                    nFeature_Spatial <= max_nFeature_Spatial &
                    percent.mt <= max_percent.mt)
    
    cat("  After:", ncol(obj), "spots\n")
    
    filtered_samples[[sample_name]] <- obj
  }
  
  return(filtered_samples)
}


# Apply filtering
spatial_samples_filt <- filter_spatial_samples(spatial_samples, filtering_thresholds)
## Filtering HCC1R ...
##   Before: 3006 spots
##   After: 2577 spots
## Filtering HCC2R ...
##   Before: 2766 spots
##   After: 1207 spots
## Filtering HCC3R ...
##   Before: 2170 spots
##   After: 1802 spots
## Filtering HCC4R ...
##   Before: 3002 spots
##   After: 1974 spots
## Filtering HCC5NR ...
##   Before: 1320 spots
##   After: 1248 spots
## Filtering HCC6NR ...
##   Before: 2575 spots
##   After: 2338 spots
## Filtering HCC7NR ...
##   Before: 2453 spots
##   After: 2450 spots
Code
# Remove slice: HCC2R
spatial_samples_filt[["HCC2R"]] <-NULL

Note: 1 Sample (HCC2R) Removed


2.4 ST - Pre Processing

Code
if (!file.exists(file.path(workDir, "/data/processed", "spatial_samples_sct.RDS"))) {
  # Function to apply SCTransform to all slices
  apply_sctransform_to_all_slices <- function(seurat_obj, assay_of_interest = "Spatial") {
    
    # Get all slice names
    slice_names <- names(seurat_obj@images)
    
    message("Found ", length(slice_names), " slices:", paste(slice_names, collapse = ", "), "\n")
    
    # Apply SCTransform to the entire object (works on all slices)
    message("Applying SCTransform normalization...\n")
    
    seurat_obj <- SCTransform(
      seurat_obj                    # Seurat object
      , assay = assay_of_interest   # input assay name
      , method = "poisson"          # use poisson model for spatial data
      , verbose = FALSE             # reduce output
    )
    
    message("SCTransform normalization completed successfully!\n")
    message("New assay 'SCT' has been added to the Seurat object.\n")
    
    # Set default assay to SCT for downstream analysis
    DefaultAssay(seurat_obj) <- "SCT"
    return(seurat_obj)
  }
  
  spatial_samples_sct <- sapply( spatial_samples_filt, apply_sctransform_to_all_slices)
  
  saveRDS(spatial_samples_sct, file.path(workDir, "/data/processed", "spatial_samples_sct.RDS"))
} else  {
  spatial_samples_sct <- readRDS(file.path(workDir, "/data/processed", "spatial_samples_sct.RDS"))
}

2.5 ST - Dim reduction

Code
if (!file.exists(file.path(workDir, "/data/processed", "spatial_samples_sct_dimr.RDS"))) {
  dimreduce <- function(seurat_obj, assay_of_interest = "Spatial") {
    # 4. Perform PCA
    seurat_obj <- RunPCA(
      seurat_obj                           # Seurat object
      , assay = "SCT"                      # use SCT assay
      , verbose = FALSE                    # reduce output
    )
    
    # 5. Find neighbors and clusters
    seurat_obj <- FindNeighbors(
      seurat_obj      # Seurat object
      , reduction = "pca"  # use PCA for neighbor finding
      , dims = 1:30   # use first 30 PCs
    )
    
    # Finding Gene Clusters
    seurat_obj <- FindClusters(
      seurat_obj
      #, resolution = 0.5
      , verbose = FALSE
    )
    
    # 6. Run UMAP
    seurat_obj <- RunUMAP(
      seurat_obj      # Seurat object
      , reduction = "pca"  # use PCA for UMAP
      , dims = 1:30   # use first 10 PCs
      , verbose = FALSE    # reduce output
    )
  }
  
  spatial_samples_sct_dimr <- sapply(spatial_samples_sct, dimreduce)
  saveRDS(spatial_samples_sct_dimr, file.path(workDir, "/data/processed", "spatial_samples_sct_dimr.RDS"))
} else  {
  spatial_samples_sct <- readRDS(file.path(workDir, "/data/processed", "spatial_samples_sct_dimr.RDS"))
}

3 Single Cell Reference Data Preparations

Data Type Single Cell RNA-Seq
DataSet GSE149614
Publication Yan et al

3.1 sc - Download Data

Code
if(!file.exists(file.path(workDir, "/data/processed", "scrna_seuratRaw.RDS")) & !file.exists(file.path(workDir, "/data/processed", "hcc_scrna_reference.RDS")) ) {
  
  geo_accession="GSE149614"
  
  fetch_eset( geo_accession , save_dir=file.path(workDir,"data/raw/single_cell") )
  
  #~~~~~~~~~~~~~~~~~~~~~~#
  # Build Seurat Object
  #~~~~~~~~~~~~~~~~~~~~~~#
  # Set file paths
  scdata_dir <- file.path(workDir,"data/raw/single_cell", "GSE149614")
  count_file <- file.path(scdata_dir, "GSE149614_HCC.scRNAseq.S71915.count.txt.gz")
  metadata_file <- file.path(scdata_dir, "GSE149614_HCC.metadata.updated.txt.gz")
  
  
  # Get the header line (cell names)
  header_line <- readLines(count_file, n = 1)
  cell_names <- strsplit(header_line, "\t")[[1]]
  
  
  # Read the data starting from line 2
  message("Reading count matrix...")
  counts <- fread(count_file, header = FALSE, skip = 1, data.table = FALSE)
  
  # Set proper dimensions
  gene_names <- counts[,1]  # First column is gene names
  counts <- as.matrix(counts[,-1])  # Remove gene column, convert to matrix
  rownames(counts) <- gene_names
  colnames(counts) <- cell_names
  counts1 <- as.matrix(counts)
  
  # Extract expression data and sample metadata
  expression_data <- exprs(eset)
  scSample_metadata <- pData(eset)
  
  message("Processing scRNA-seq data for anchor-based integration...")
  
  # 2. Read metadata  
  message("Reading metadata...")
  metadata <- fread(metadata_file, header = TRUE, data.table = FALSE)
  rownames(metadata) <- metadata$Cell  # Match to cell names
  
  # 3. Ensure cell names match
  cat("Cell Names Match:", identical(colnames(counts), rownames(metadata) ))
  
  # 4. Create Seurat object
  message("Creating Seurat object...")
  scrna_ref_raw <- CreateSeuratObject(
    counts = counts,
    meta.data = metadata,
    project = "HCC_GSE149614",
    min.cells = 3,
    min.features = 200
  )
  
  saveRDS(scrna_ref, file.path(workDir, "/data/processed", "scrna_seuratRaw.RDS"))
} else  {
  scrna_ref <- readRDS(file.path(workDir, "/data/processed", "scrna_seuratRaw.RDS"))
}

3.2 sc - Process Data

Here we SCTransform the single cell data to keep the noise model the same as the spatial data.

Code
if(!file.exists(file.path(workDir, "/data/processed", "hcc_scrna_reference.RDS")) ) {
  # Standard scRNA-seq processing
  Idents(scrna_ref) <- "celltype"
  
  # Normalize Data
  scrna_ref <- SCTransform (
    scrna_ref
    , ncells = 3000 # learns noise models on 3000 cells, whole dataset normalised with no loss in performance
    , verbose = FALSE ) %>% 
    # PCA
    RunPCA(verbose = FALSE)  %>% 
    # UMAP
    RunUMAP( dims = 1:30 )
  
  # Save processed reference
  saveRDS(scrna_ref, file = file.path(workDir, "/data/processed", "hcc_scrna_reference.RDS"))
  
} else {
  scrna_ref <- readRDS(file.path(workDir, "/data/processed", "hcc_scrna_reference.RDS"))
}

message("scRNA-seq reference prepared for anchor-based integration")

table(scrna_ref$celltype)
## 
##           B Endothelial  Fibroblast  Hepatocyte     Myeloid        T/NK 
##        3685        3644        2266       20782       15947       25591